import pandas as pd
import json
import numpy as np
from itertools import repeat
import re
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
from concurrent.futures import wait as futures_wait
from concurrent.futures.process import ProcessPoolExecutor
import importlib
import hashtag_util as ut
import sys
sys.path.insert(0, '../')
import general_utils as gen_ut
# Load only the columns we need; 'hashtags' holds a JSON-encoded entity list.
df = pd.read_csv('../tweets_proVax.csv', low_memory=False,
                 usecols=['hashtags', 'created_at'])
# Twitter's timestamp format, e.g. "Mon Jan 06 12:34:56 +0000 2020".
df['created_at'] = pd.to_datetime(df['created_at'], format="%a %b %d %X %z %Y")

# Collect every hashtag occurrence across all tweets.
listHashtags = []
for s in df['hashtags']:
    # gen_ut.get_string_json extracts the 'text' field of each hashtag entry.
    listHashtags.extend(gen_ut.get_string_json(s, 'text'))

# value_counts() replaces the old add-a-zero-column + groupby().count() trick:
# it yields the per-hashtag frequency already sorted in descending order.
dfHashtags = (pd.Series(listHashtags)
                .value_counts()
                .rename('count')
                .to_frame())
dfHashtags.index.name = 'hashtags'
dfHashtags
| count | |
|---|---|
| hashtags | |
| vaccino | 32371 |
| vaccini | 26628 |
| AstraZeneca | 16243 |
| COVID19 | 15841 |
| coronavirus | 9440 |
| ... | ... |
| Tgrin60secondi | 1 |
| Tgtg | 1 |
| Thanks | 1 |
| Thanksgiving | 1 |
| Verifica | 1 |
19114 rows × 1 columns
# Reload the helper module so edits to hashtag_util.py are picked up
# without restarting the kernel.
importlib.reload(ut)
# Per-week hashtag usage table (columns: Week/Year, hashtag, count),
# built by the project helper from the raw tweets dataframe.
dfUse = ut.process_dfUse(df)
dfUse
| Week/Year | hashtag | count | |
|---|---|---|---|
| 37790 | 2020-01-06 | Bergamo | 1 |
| 37809 | 2020-01-06 | Zaia | 1 |
| 37808 | 2020-01-06 | WW3 | 1 |
| 37807 | 2020-01-06 | VACCINI | 2 |
| 37805 | 2020-01-06 | Rompilatrasmissione | 1 |
| ... | ... | ... | ... |
| 6407 | 2021-05-17 | PalazzoBraschi | 1 |
| 6408 | 2021-05-17 | Palermo | 2 |
| 6409 | 2021-05-17 | Palestina | 1 |
| 6437 | 2021-05-17 | Quirinale | 1 |
| 6132 | 2021-05-17 | 18Maggio | 7 |
52617 rows × 3 columns
# Reload the helper module to pick up any edits without a kernel restart.
importlib.reload(ut)
# Histogram of the most-used hashtags; the numeric arguments are presumably
# count thresholds/bins — confirm in hashtag_util.visual_histogram.
ut.visual_histogram(dfHashtags,3500,3000,2500,2000,1500,1000)
# Time-series views of hashtag usage, combined and split per hashtag.
ut.visual_by_date_together(dfHashtags,dfUse)
ut.visual_by_date_split(dfHashtags,dfUse)
# Regexes (matched against lowercased hashtags) for vaccine/COVID terms that
# dominate the counts and drown out the rest of the distribution.
hastagRemove = ['vaccin.*', 'covid.*', 'corona.*', 'astrazeneca', 'pfizer',
                'sarscov2', 'sputnikv', 'moderna']
# BUG FIX: the original aliased dfHashtags ('dfHashtagFiltered = dfHashtags'),
# so the in-place drops silently mutated dfHashtags too, skewing every later
# total computed from it (e.g. sum(dfHashtags['count'])). Work on a copy and
# filter without mutation.
dfHashtagFiltered = dfHashtags.copy()
for r in hastagRemove:
    # '== True' coerces any non-boolean match result (e.g. NaN) to False.
    mask = dfHashtagFiltered.index.str.lower().str.match(r) == True
    dfHashtagFiltered = dfHashtagFiltered[~mask]
dfHashtagFiltered
| count | |
|---|---|
| hashtags | |
| Lombardia | 6225 |
| Draghi | 5247 |
| Arcuri | 4992 |
| Mattarella | 4010 |
| ANSA | 3980 |
| ... | ... |
| Tgrin60secondi | 1 |
| Tgtg | 1 |
| Thanks | 1 |
| Thanksgiving | 1 |
| Verifica | 1 |
18384 rows × 1 columns
# Re-plot the distribution now that the dominant vaccine/COVID tags are gone.
ut.visual_histogram(dfHashtagFiltered, 1000, 500)
ut.visual_by_date_together(dfHashtagFiltered, dfUse)
ut.visual_by_date_split(dfHashtagFiltered, dfUse)

# Second pass: also remove Italian-politics hashtags.
# BUG FIX: the original aliased dfHashtagFiltered, so the in-place drops
# mutated it as well; copy first and filter without mutation.
dfMoreFiltered = dfHashtagFiltered.copy()
hastagRemove = ['.*lombardia.*', 'draghi', 'conte', 'm5s', 'mattarella',
                'salvini', 'speranza', 'renzi', 'lega', '.*governo.*',
                '.*moratti.*', 'zingaretti', 'scanzi', 'burioni', 'crisanti']
for r in hastagRemove:
    # '== True' coerces any non-boolean match result (e.g. NaN) to False.
    mask = dfMoreFiltered.index.str.lower().str.match(r) == True
    dfMoreFiltered = dfMoreFiltered[~mask]
dfMoreFiltered
| count | |
|---|---|
| hashtags | |
| Arcuri | 4992 |
| ANSA | 3980 |
| Figliuolo | 3652 |
| novax | 2916 |
| Italia | 2149 |
| ... | ... |
| Tgrin60secondi | 1 |
| Tgtg | 1 |
| Thanks | 1 |
| Thanksgiving | 1 |
| Verifica | 1 |
17943 rows × 1 columns
# Distribution after removing politics-related hashtags as well.
ut.visual_histogram(dfMoreFiltered, 1000, 500)
ut.visual_by_date_together(dfMoreFiltered, dfUse)
ut.visual_by_date_split(dfMoreFiltered, dfUse)

# Conspiracy / no-vax-adjacent hashtags to investigate.
# BUG FIX: the original patterns 'dittatura*.' and 'byoblu*.' put the
# quantifier before the wildcard, so they required one extra trailing char
# and never matched the bare hashtag; '.*' is the intended "any suffix".
listHashtagsStudy = ['5g', 'billgates', 'dittatura.*', 'disobbedisco',
                     'nessunacorrelazione', 'byoblu.*']
dfSuspect = pd.DataFrame(index=listHashtagsStudy)
for r in listHashtagsStudy:
    mask = dfMoreFiltered.index.str.lower().str.match(r) == True
    # Total uses of every hashtag matching this pattern.
    dfSuspect.loc[r, 'count'] = dfMoreFiltered.loc[mask, 'count'].sum()
dfSuspect.sort_values('count', inplace=True)

# Grand total of all hashtag uses, used as the percentage denominator.
total = dfHashtags['count'].sum()
fig = px.histogram(y=dfSuspect.index, x=dfSuspect['count'] * 100 / total,
                   orientation='h')
# BUG FIX: removed the stray ']' that terminated the original title.
fig.update_layout(title="Use of suspect hashtag (as a proportion of the total)")
fig.update_yaxes(title="Hashtag")
fig.update_xaxes(title="Usage percent")
fig.show()

fig = px.histogram(y=dfSuspect.index, x=dfSuspect['count'], orientation='h')
fig.update_layout(title="Use of suspect hashtag (total = %d)" % total)
fig.update_yaxes(title="Hashtag")
fig.update_xaxes(title="Usage")
fig.show()

ut.visual_by_date_together(dfSuspect, dfUse)
# Re-read the tweets, this time keeping the author of each tweet.
df = pd.read_csv('../tweets_proVax.csv', low_memory=False,
                 usecols=['hashtags', 'user_screen_name'])
df

# One row per (user, hashtag) occurrence, reduced to one boolean column per
# suspect hashtag per user: did the user ever use it?
dfUseSus = ut.process_df_uses_hashtags(df, dfSuspect.index)
dfUseSus = dfUseSus.groupby('user').any()
dfUseSus = ut.hashtagAND(dfSuspect.index, dfUseSus)
# OR<i> columns flag users that used at least i of the suspect hashtags.
for i in range(1, len(listHashtagsStudy)):
    dfUseSus = ut.hashtagOR(dfSuspect.index, dfUseSus, 'OR' + str(i), i)
dfUseSus

print("Number of account high credibility that uses at least i hashtags")
# CONSISTENCY FIX: derive the bound from listHashtagsStudy (as the loop above
# does) instead of the hard-coded range(1,6), so both stay in sync if the
# hashtag list changes.
for i in range(1, len(listHashtagsStudy)):
    or_i = "OR%d" % i
    dfUseHashtagNovax = dfUseSus[dfUseSus[or_i]]
    print("\ti =", i, ":\t", (len(dfUseHashtagNovax) / len(dfUseSus)) * 100, "%")
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')
import os
# Export this notebook to HTML. nb_name is injected at runtime by the
# %%javascript cell above.
# BUG FIX: quote the filename so notebook names containing spaces do not
# break the shell command; note the name is still shell-interpolated —
# treat it as trusted (it comes from the Jupyter frontend, not user input).
if os.system('jupyter nbconvert "%s" --to html' % nb_name) == 0:
    print("Notebook converted correctly")
else:
    # BUG FIX: 'convertion' -> 'conversion' in the error message.
    print("Notebook conversion had an error")